'Walsh Hadamard transform code for low end AMD64 CPUs.
'Linux assembly language and FreeBasic 1.09.0
'hsixteen subroutine
' haddps and hsubps are kind of slow on such cpus.
' using pshufd and some sign fliping is faster.
'beyond4 subroutine
' ReLU switches the forward connected weights on and off.
' Beyond ReLU switches between forward connected weights (mem*2).
' Beyond4 computes multiple width 4 Beyond ReLU layers.

sub wht16(vec as single ptr,n as ulongint)
  dim as ulongint hs = 16
  while hs < n
    dim as ulongint i
    while i < n 
      dim as ulongint j = i + hs
      while i < j
        dim as single a = vec[i]
        dim as single b = vec[i + hs]
        vec[i] = a + b
        vec[i + hs] = a - b
        i += 1
      wend
      i += hs
    wend
    hs += hs
  wend
end sub

sub wht32(vec as single ptr,n as ulongint)
  dim as ulongint hs = 32
  while hs < n
    dim as ulongint i
    while i < n 
      dim as ulongint j = i + hs
      while i < j
        dim as single a = vec[i]
        dim as single b = vec[i + hs]
        vec[i] = a + b
        vec[i + hs] = a - b
        i += 1
      wend
      i += hs
    wend
    hs += hs
  wend
end sub

sub hsixteen naked(x as single ptr, n as ulongint)
asm
   movaps xmm8,xcf[rip]
   movaps xmm9,xcf[rip+16]
   movaps xmm10,xcf[rip+2*16]
  .align 16
h16blp:
  movaps xmm0,[rdi]
  movaps xmm4,[rdi+16]
  pshufd xmm1,xmm0,&b01010101
  pshufd xmm2,xmm0,&b10101010
  pshufd xmm3,xmm0,&b11111111
  pshufd xmm0,xmm0,0
  pshufd xmm5,xmm4,&b01010101
  pshufd xmm6,xmm4,&b10101010
  pshufd xmm7,xmm4,&b11111111
  pshufd xmm4,xmm4,0
  pxor xmm1,xmm8
  pxor xmm2,xmm9
  pxor xmm3,xmm10
  pxor xmm5,xmm8
  pxor xmm6,xmm9
  pxor xmm7,xmm10
  addps xmm0,xmm1
  addps xmm4,xmm5
  addps xmm2,xmm3
  addps xmm6,xmm7
  movaps xmm1,[rdi+2*16]
  movaps xmm5,[rdi+3*16]
  addps xmm0,xmm2 ' a
  addps xmm4,xmm6 ' b
  
  
  pshufd xmm2,xmm1,&b10101010
  pshufd xmm3,xmm1,&b11111111
  pshufd xmm11,xmm1,0
  pshufd xmm1,xmm1,&b01010101
  
  
  pshufd xmm6,xmm5,&b10101010
  pshufd xmm7,xmm5,&b11111111
  pshufd xmm12,xmm5,0
  pshufd xmm5,xmm5,&b01010101
  pxor xmm1,xmm8
  pxor xmm2,xmm9
  pxor xmm3,xmm10
  pxor xmm5,xmm8
  pxor xmm6,xmm9
  pxor xmm7,xmm10
  
  addps xmm11,xmm1
  addps xmm12,xmm5
  addps xmm2,xmm3
  addps xmm6,xmm7
  sub rsi,16
  addps xmm11,xmm2 'c
  addps xmm12,xmm6 'd
  
  movaps xmm1,xmm0  'a
  movaps xmm2,xmm11 'c
  addps xmm0,xmm4   'a+b
  subps xmm1,xmm4   'a-b
  addps xmm11,xmm12 'c+d
  subps xmm2,xmm12  'c-d
  
  movaps xmm3,xmm0  'a+b
  movaps xmm5,xmm1  'a-b
  addps xmm0,xmm11  'a+b+c+d
  subps xmm3,xmm11  'a+b-c-d
  
  addps xmm1,xmm2	'a-b+c-d
  subps xmm5,xmm2   'a-b-c+d
  
  movaps [rdi],xmm0
  movaps [rdi+16],xmm1
  movaps [rdi+2*16],xmm3
  movaps [rdi+3*16],xmm5
  
  lea rdi,[rdi+64]
  jnz h16blp
  ret
  .align 16
xcf:
  .long 0,&h80000000,0,&h80000000
  .long 0,0,&h80000000,&h80000000 
  .long 0,&h80000000,&h80000000,0
end asm
end sub

sub wht81632 naked (x as single ptr, n as ulongint)
asm	
	.align 16
w81632:
	movaps xmm0,[rdi]
	movaps xmm1,[rdi+16]
	movaps xmm2,[rdi+2*16]
	movaps xmm3,[rdi+3*16]
	movaps xmm4,[rdi+4*16]
	movaps xmm5,[rdi+5*16]
	movaps xmm6,[rdi+6*16]
	movaps xmm7,[rdi+7*16]
	sub rsi,32
	movaps xmm8,xmm0
	movaps xmm9,xmm2
	movaps xmm10,xmm4
	movaps xmm11,xmm6
	addps xmm0,xmm1 '0
	addps xmm2,xmm3 '2
	addps xmm4,xmm5 '4
	addps xmm6,xmm7 '6
	subps xmm8,xmm1 '1
	subps xmm9,xmm3 '3
	subps xmm10,xmm5'5
	subps xmm11,xmm7'7
	movaps xmm12,xmm0 '0
	movaps xmm13,xmm8 '1
	movaps xmm14,xmm4 '4
	movaps xmm15,xmm10'5
	addps xmm0,xmm2  '0
	addps xmm4,xmm6  '4
	addps xmm8,xmm9  '1
	addps xmm10,xmm11'5
	subps xmm12,xmm2 '2
	subps xmm13,xmm9 '3
	subps xmm14,xmm6 '6
	subps xmm15,xmm11'7
	
	movaps xmm1,xmm0  '0	
	movaps xmm2,xmm8  '1
	movaps xmm3,xmm12 '2	
	movaps xmm5,xmm13 '3
	addps xmm0,xmm4	  '0
	addps xmm8,xmm10  '1
	addps xmm12,xmm14 '2
	addps xmm13,xmm15 '3
	subps xmm1,xmm4	  '4
	subps xmm2,xmm10  '5
	subps xmm3,xmm14  '6
	subps xmm5,xmm15  '7
	
	movaps [rdi],xmm0
	movaps [rdi+16],xmm8
	movaps [rdi+2*16],xmm12
	movaps [rdi+3*16],xmm13
	movaps [rdi+4*16],xmm1
	movaps [rdi+5*16],xmm2
	movaps [rdi+6*16],xmm3
	movaps [rdi+7*16],xmm5
	lea rdi,[rdi+128]
    jnz w81632
    ret
end asm
end sub
	
'32 elements minimum	
sub	wht4(x as single ptr,n as ulongint)
  wht81632(x,n)
  wht32(x,n)
end sub  
'16 element min
sub wht(x as single ptr,n as ulongint)
  hsixteen(x,n)
  wht16(x,n)
end sub


sub beyond4 naked(vec as single ptr,wts as single ptr,n as ulongint)
asm
mov eax,16
.align 16
bey4clp:
movaps xmm4,[rdi]
mov ecx,[rdi]
mov r8d,[rdi+4]
mov r9d,[rdi+8]
mov r10d,[rdi+12]
pshufd xmm0,xmm4,0
pshufd xmm1,xmm4,&b01010101
pshufd xmm2,xmm4,&b10101010
pshufd xmm3,xmm4,&b11111111
shr ecx,27
shr r8d,27
shr r9d,27
shr r10d,27
and ecx,eax
and r8d,eax
and r9d,eax
and r10d,eax
mulps xmm0,[rsi+rcx]
mulps xmm1,[rsi+r8+32]
mulps xmm2,[rsi+r9+2*32]
mulps xmm3,[rsi+r10+3*32]
addps xmm0,xmm1
addps xmm2,xmm3
sub rdx,4
lea rsi,[rsi+4*32]
addps xmm0,xmm2
movaps [rdi],xmm0
lea rdi,[rdi+16]
jnz bey4clp
ret
end asm
end sub

sub copy(result as single ptr,x as single ptr,n as ulongint)
	for i as ulongint=0 to n-1
		result[i]=x[i]
	next
end sub

sub multiply(result as single ptr,x as single ptr,y as single ptr,n as ulongint)
	for i as ulongint=0 to n-1
		result[i]=x[i]*y[i]
	next
end sub

function errorl2(x as single ptr,y as single ptr,n as ulongint) as single
  dim as single sum
  for i as ulongint=0 to n-1
    dim as single e=x[i]-y[i]
    sum+=e*e
  next
  return sum
end function  

